// A crawler that fetches the cnblogs news listing, extracts each
// article's title and body via XPath rules, and stores the results
// in MongoDB.
package main

import (
	"fmt"

	"dense_spider/core/common/extract"
	"dense_spider/core/common/filter"
	"dense_spider/core/history"
	"dense_spider/core/page_processer"
	"dense_spider/core/spider"
	"dense_spider/extension/pipeline"
)

// main configures and runs a crawler over the cnblogs news listing.
// It extracts each article's title and body with XPath rules,
// deduplicates visited URLs through a MongoDB-backed history, and
// persists the extracted items via a MongoDB pipeline.
func main() {
	// XPath extraction rules: one named rule per field pulled from a page.
	// Raw string literals avoid double-escaping the embedded quotes.
	ex := make([]extract.ExtractRule, 0, 2)
	ex = append(ex, extract.NewExtractRule("Title", `//*[@id="news_title"]/a`))
	ex = append(ex, extract.NewExtractRule("Content", `//*[@id="news_body"]`))

	// Present a desktop-browser user agent so the site serves regular HTML.
	userAgent := "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"

	// NewSpider args: page processer; task name (used by pipelines for
	// record keeping); seed URL; extraction rules; user agent.
	sp := spider.NewSpider(page_processer.NewHtmlPageProcesser(), "cnblogs", "http://news.cnblogs.com/n/page/10/", ex, userAgent)
	sp.SetMaxCrawlDepth(1)
	sp.SetThreadnum(4)

	// Skip static assets (images, styles, scripts); raw string literal
	// avoids the doubled backslashes of an interpreted literal.
	sp.AddExcludeFilter(filter.NewRegexFilter(`(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)`))

	// Only follow links that stay on the news site.
	sp.AddIncludeFilter(filter.NewRegexFilter("news.cnblogs.com"))

	// Record visited URLs in MongoDB so re-runs do not refetch them.
	sp.SetHistory(history.NewMongodbHistory())

	// Persist extracted items to MongoDB.
	sp.AddPipeline(pipeline.NewPipelineMongodb())
	sp.Run()

	// fmt.Println rather than the builtin println: the builtin writes to
	// stderr and is not guaranteed to exist by the Go spec. The label now
	// names the call actually made above (Run, not Get).
	fmt.Println("-----------------------------------spider.Run---------------------------------")
}
